In [ ]:
# For torch functions
import torch

# for Neural network layers 
import torch.nn as nn

# For neural network functions:
import torch.nn.functional as F

# For Open ML datasets available in pytorch
from torchvision import transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader, Subset, ConcatDataset, Dataset
from torchvision.utils import make_grid

# for Optimization function in pytorch
import torch.optim as optim

from torchsummary import summary
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import plotly as pt
%matplotlib inline

# Speech Denoising Modules
import librosa
import IPython.display as ipd
from scipy.io import wavfile
In [ ]:
print(torch.__version__)
1.6.0+cu101
In [ ]:
print(torch.cuda.get_device_name(0))
use_cuda = torch.cuda.is_available()
print(use_cuda)
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.backends.cudnn.benchmark = True
device
Tesla T4
True
Out[ ]:
device(type='cuda', index=0)
In [ ]:
!nvidia-smi
Wed Oct 28 01:07:57 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.05    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8     9W /  70W |     10MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+

Problem 1: Speech Denoising Using 1D CNN

In [ ]:
# Load Audio Files
s, sr=librosa.load('train_clean_male.wav', sr=None) 
S=librosa.stft(s, n_fft=1024, hop_length=512)
sn, sr=librosa.load('train_dirty_male.wav', sr=None) 
X=librosa.stft(sn, n_fft=1024, hop_length=512)

S, X = np.abs(S), np.abs(X)

# Covert to Pytorch tensors:
S = torch.tensor(np.transpose(S)).to(device)
X = torch.tensor(np.transpose(X)).to(device)

# # Dataloaders
BATCH_SIZE = 128
TrainLoader = DataLoader(X, batch_size=BATCH_SIZE)
TestLoader = DataLoader(S, batch_size=BATCH_SIZE)
In [ ]:
ipd.Audio('train_clean_male.wav')
Out[ ]:
In [ ]:
ipd.Audio('train_dirty_male.wav')
Out[ ]:

Define Model

In [ ]:
class SpeechDenoiser(nn.Module):

    def __init__(self):
        super(SpeechDenoiser, self).__init__()

        # Convolution Layers:
        self.conv1 = nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=2, stride=2),
            nn.ReLU(),       
            nn.MaxPool1d(kernel_size=2, stride=1))
        
        self.conv2 = nn.Sequential(
            nn.Conv1d(16, 32, kernel_size=2, stride=2),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=1))
        
        # Linear Layer:
        self.fc1 = nn.Linear(4032, 2016)
        self.fc2 = nn.Linear(2016, 513)

        # Weight Initialization
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, a=0, mode='fan_in', nonlinearity='relu')

        # Activation
        self.relu = nn.ReLU()



    def forward(self, x):
        # Pass the input tensor x through our layers:
        x = x[:, np.newaxis,:]

        # x = x.view(-1, 513).to(device)

        # Layer 1
        x = self.conv1(x)

        # Layer 2
        x = self.conv2(x)

        # First Linear Layer
        x = x.reshape(x.shape[0], -1)
        x = self.relu(self.fc1(x))
        
        # Final Layer
        x = self.fc2(x)
        x = torch.nn.functional.relu(x)

        return x
In [ ]:
def train_model(model, epochs=200):
    # Loss Function
    criterion = nn.MSELoss()

    # Loss Optimizer
    optimizer = optim.Adam(model.parameters(), lr = 0.001)

    # No:of times to train data
    loss_of_each_epoch = []

    for e in tqdm(range(epochs)):
        train_running_loss = 0.0
        train_acc = 0.0
        for (input, output) in zip(TrainLoader, TestLoader):
            # Convert the input to required format 
            input = input.to(device)
            output = output.to(device)

            # set optimizer gradients to zero:
            optimizer.zero_grad()

            predictions = model(input) # Output predictions
            loss = criterion(predictions, output) # Loss  Caluclation
            loss.backward() # Pass loss function gradients to pervious layers:
            optimizer.step() # Update Weights
        loss_of_each_epoch.append(loss.data.cpu().numpy())
    print(loss.data)
    plt.plot(range(epochs), loss_of_each_epoch)
    plt.show()
In [ ]:
model = SpeechDenoiser().to(device)
In [ ]:
summary(model, (513,))
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Conv1d-1              [-1, 16, 256]              48
              ReLU-2              [-1, 16, 256]               0
         MaxPool1d-3              [-1, 16, 255]               0
            Conv1d-4              [-1, 32, 127]           1,056
              ReLU-5              [-1, 32, 127]               0
         MaxPool1d-6              [-1, 32, 126]               0
            Linear-7                 [-1, 2016]       8,130,528
              ReLU-8                 [-1, 2016]               0
            Linear-9                  [-1, 513]       1,034,721
================================================================
Total params: 9,166,353
Trainable params: 9,166,353
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.22
Params size (MB): 34.97
Estimated Total Size (MB): 35.19
----------------------------------------------------------------

Train Model

In [ ]:
train_model(model,epochs=500)
tensor(0.0004, device='cuda:0')

Save Model

In [ ]:
PATH = './Problem1.pth'
torch.save(model.state_dict(), PATH) 

# Load Model
# NN = NeuralNetwork().to(device)
# NN.load_state_dict(torch.load(PATH))
In [ ]:
def audio_test(input_file, output_file, model):
    s, sr = librosa.load(input_file, sr=None) 
    temp = librosa.stft(s, n_fft=1024, hop_length=512)
    temp_abs = torch.tensor(np.abs(temp))
    temp_abs = np.transpose(temp_abs)
    TempLoader = torch.utils.data.DataLoader(temp_abs, batch_size=temp_abs.shape[0])
    with torch.no_grad():
        for i in TempLoader:
            i = i.to(device)
            output = model(i)
        recov = (temp/np.abs(temp)) * output.detach().cpu().numpy().T
        recov_istft = librosa.istft(recov, hop_length=512)
        librosa.output.write_wav(output_file, recov_istft, sr)

Test Audio 1

In [ ]:
# Testing audio 1
audio_test('test_x_01.wav', 'test_x_01_recons_model.wav', model)
In [ ]:
samplingFrequency, signalData = wavfile.read('test_x_01.wav')
plt.plot(signalData)
ipd.Audio('test_x_01.wav')
Out[ ]:
In [ ]:
samplingFrequency, signalData = wavfile.read('test_x_01_recons_model.wav')
plt.plot(signalData)
ipd.Audio('test_x_01_recons_model.wav')
Out[ ]:

Test Audio 2

In [ ]:
# Testing audio 2
audio_test('test_x_02.wav', 'test_x_02_recons_model.wav', model)
In [ ]:
samplingFrequency, signalData = wavfile.read('test_x_02.wav')
plt.plot(signalData)
ipd.Audio('test_x_02.wav')
Out[ ]:
In [ ]:
samplingFrequency, signalData = wavfile.read('test_x_02_recons_model.wav')
plt.plot(signalData)
ipd.Audio('test_x_02_recons_model.wav')
Out[ ]:

Problem 2: Speech Denoising Using 2D CNN

In [ ]:
# Load Audio Files
s, sr=librosa.load('train_clean_male.wav', sr=None) 
S=librosa.stft(s, n_fft=1024, hop_length=512)
sn, sr=librosa.load('train_dirty_male.wav', sr=None) 
X=librosa.stft(sn, n_fft=1024, hop_length=512)

S = torch.tensor(np.abs(S)).float().T
X = torch.tensor(np.abs(X)).float().T
In [ ]:
class AudioDataset(Dataset):
    def __init__(self, data, label):
        self.train = data
        self.label = label
    
    def __len__(self):
        if len(self.train) == len(self.label):
            return len(self.train)
    
    def __getitem__(self, idx):
        return self.train[idx], self.label[idx]
In [ ]:
def data_preprocessing(input):
    output = []
    for i in range(20, input.shape[0]+1):
        output.append(input[i-20:i,:].view(1,20,513))
    return torch.cat(output)
def add_frames(input):
    silent_frames = torch.distributions.Normal(0, 0.00005)
    frames = silent_frames.sample((19,513))
    return torch.cat((frames, input))
In [ ]:
input = add_frames(X)
input = data_preprocessing(input).to(device)
output = S.to(device)
print(input.shape, output.shape)
torch.Size([2459, 20, 513]) torch.Size([2459, 513])
In [ ]:
ds = AudioDataset(input, output)
In [ ]:
train_loader = DataLoader(ds, batch_size=256, num_workers=0)
In [ ]:
x, y = next(iter(train_loader))
x = x[:, np.newaxis,:,:]
x.shape
Out[ ]:
torch.Size([256, 1, 20, 513])
In [ ]:
conv = nn.Sequential(
                nn.Conv2d(1, 4, kernel_size=2, stride=1),
                nn.ReLU(),       
                nn.AvgPool2d(kernel_size=2, stride=1),
            
                nn.Conv2d(4, 8, kernel_size=2, stride=1),
                nn.ReLU(),
                nn.AvgPool2d(kernel_size=2, stride=1),
        
                nn.Conv2d(8, 16, kernel_size=2, stride=2),
                nn.ReLU(),
                nn.AvgPool2d(kernel_size=2, stride=2)).to(device)
o = conv(x)
o.shape
Out[ ]:
torch.Size([256, 16, 4, 127])
In [ ]:
        self.conv = nn.Sequential(
                nn.Conv2d(1, 8, kernel_size=2, stride=1),
                nn.ReLU(),       
                nn.AvgPool2d(kernel_size=2, stride=1),
            
                nn.Conv2d(8, 16, kernel_size=2, stride=1),
                nn.ReLU(),
                nn.AvgPool2d(kernel_size=2, stride=1),
        
                nn.Conv2d(16, 32, kernel_size=2, stride=2),
                nn.ReLU(),
                nn.AvgPool2d(kernel_size=2, stride=2))

        self.fc1 = nn.Linear(in_features=16256, out_features=8000)
        self.fc2 = nn.Linear(in_features=8000, out_features=1000)
        self.fc3 = nn.Linear(in_features=1000, out_features=513)
Out[ ]:
8128
In [ ]:
 class SpeechDenoiser2(nn.Module):
  
    def __init__(self):
        super(SpeechDenoiser2, self).__init__()

        # Convolution Layers:
        self.conv1 = nn.Sequential(
                nn.Conv2d(1, 8, kernel_size=2, stride=1),
                nn.ReLU(),       
                nn.MaxPool2d(kernel_size=2, stride=1))

        self.conv2 = nn.Sequential(
                nn.Conv2d(8, 16, kernel_size=2, stride=1),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2, stride=1))
        
        self.conv3 = nn.Sequential(
                nn.Conv2d(16, 32, kernel_size=2, stride=2),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2, stride=2))
        
        # Linear Layer:
        self.fc1 = nn.Linear(16256, 8000)
        self.fc2 = nn.Linear(8000, 1000)
        self.fc3 = nn.Linear(1000, 513)

        # # Weight Intialization:
        # for m in self.modules():
        #     if isinstance(m, nn.Linear):
        #         nn.init.kaiming_normal_(m.weight, a=0, mode='fan_in', nonlinearity='relu')


        # Activation
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)



    def forward(self, x):
        # Data
        x = x[:, np.newaxis,:,:]

        # Layer1
        x = self.conv1(x)

        # Layer2
        x = self.conv2(x)

        # Layer3
        x = self.conv3(x)

        # First Linear Layer
        x = x.reshape(x.shape[0], -1)
        # x = self.fc1(x)
        x = self.relu(self.fc1(x))
        x = self.dropout(x)

        # second LInear Layer
        x = self.relu(self.fc2(x))
        x = self.dropout(x)

        # Final Layer
        x = self.fc3(x)
        x = torch.nn.functional.relu(x)

        return x 
In [ ]:
def train_model(model, epochs=200):
    # Loss Function
    criterion = nn.MSELoss()

    # Loss Optimizer
    optimizer = optim.Adam(model.parameters(), lr = 0.001)
    # optimizer = optim.SGD(model.parameters(), lr=0.1)

    # No:of times to train data
    loss_of_each_epoch = []

    for e in tqdm(range(epochs)):
        train_running_loss = 0.0
        train_acc = 0.0
        for (input, output) in train_loader:
            # set optimizer gradients to zero:
            optimizer.zero_grad()

            predictions = model(input) # Output predictions
            loss = criterion(predictions, output) # Loss  Caluclation
            loss.backward() # Pass loss function gradients to pervious layers:
            optimizer.step() # Update Weights
        loss_of_each_epoch.append(loss.data.cpu().numpy())
    print(loss.data)
    plt.plot(range(epochs), loss_of_each_epoch)
    plt.show()
In [ ]:
model = SpeechDenoiser2().to(device)
In [ ]:
summary(model, (20,513))
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Conv2d-1           [-1, 8, 19, 512]              40
              ReLU-2           [-1, 8, 19, 512]               0
         MaxPool2d-3           [-1, 8, 18, 511]               0
            Conv2d-4          [-1, 16, 17, 510]             528
              ReLU-5          [-1, 16, 17, 510]               0
         MaxPool2d-6          [-1, 16, 16, 509]               0
            Conv2d-7           [-1, 32, 8, 254]           2,080
              ReLU-8           [-1, 32, 8, 254]               0
         MaxPool2d-9           [-1, 32, 4, 127]               0
           Linear-10                 [-1, 8000]     130,056,000
             ReLU-11                 [-1, 8000]               0
          Dropout-12                 [-1, 8000]               0
           Linear-13                 [-1, 1000]       8,001,000
             ReLU-14                 [-1, 1000]               0
          Dropout-15                 [-1, 1000]               0
           Linear-16                  [-1, 513]         513,513
================================================================
Total params: 138,573,161
Trainable params: 138,573,161
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.04
Forward/backward pass size (MB): 6.19
Params size (MB): 528.61
Estimated Total Size (MB): 534.84
----------------------------------------------------------------
In [ ]:
train_model(model,epochs=150)
tensor(0.0045, device='cuda:0')

Save Model

In [ ]:
PATH = './Problem2.pth'
torch.save(model.state_dict(), PATH) 

# # Load Model
# model = SpeechDenoiser2().to(device)
# model.load_state_dict(torch.load(PATH))
In [ ]:
def audio_test(input_file, output_file, model):
    # Load Data
    s, sr = librosa.load(input_file, sr=None) 
    temp = librosa.stft(s, n_fft=1024, hop_length=512)

    # Create a DataLoader
    temp_ = torch.tensor(np.abs(temp)).float().T
    temp_ = add_frames(temp_)
    temp_ = data_preprocessing(temp_)
    TempLoader = DataLoader(temp_, batch_size=len(temp_))

    # Predictions
    with torch.no_grad():
        # each_array = [0] * 513
        # output_list = np.array([each_array] * 19)
        # output_list = []
        for i in TempLoader:
            i = i.to(device)
            output = model(i)
            # output_list = output
            output_list = output.cpu().numpy()
            # output_list = np.concatenate((output_list, output), 0)
        
        # Recover Audio from predictions
        recov = (temp/np.abs(temp)) * output_list.T
        recov_istft = librosa.istft(recov, hop_length=512)
        librosa.output.write_wav(output_file, recov_istft, sr)
        return output_list

Test Audio

In [ ]:
# Testing audio 1
out = audio_test('test_x_01.wav', 'test_x_01_recons_model2.wav', model)
In [ ]:
samplingFrequency, signalData = wavfile.read('test_x_01.wav')
plt.plot(signalData)
ipd.Audio('test_x_01.wav')
Out[ ]:
In [ ]:
samplingFrequency, signalData = wavfile.read('test_x_01_recons_model2.wav')
plt.plot(signalData)
ipd.Audio('test_x_01_recons_model2.wav')
Out[ ]:
In [ ]:
# Testing audio 2
out = audio_test('test_x_02.wav', 'test_x_02_recons_model2.wav', model)
In [ ]:
samplingFrequency, signalData = wavfile.read('test_x_02.wav')
plt.plot(signalData)
ipd.Audio('test_x_02.wav')
Out[ ]:
In [ ]:
samplingFrequency, signalData = wavfile.read('test_x_02_recons_model2.wav')
plt.plot(signalData)
ipd.Audio('test_x_02_recons_model2.wav')
Out[ ]:
In [ ]:
pred = audio_test('train_dirty_male.wav', 'train_dirty_male_recons.wav', model)
sam = S.cpu().numpy()
print(f'SNR of Train Audio => {10*np.log10(np.sum(np.square(sam))/(np.sum(np.square(np.subtract(sam,pred)))))}')
SNR of Train Audio => 10.009456872940063

Problem 3: Data Augmentation

In [ ]:
# Transformation 
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# Load train data
trainset = CIFAR10(root='./data', train=True, download=True, transform=transform)
Files already downloaded and verified
In [ ]:
# Divide Data into train and validation sets.
index = list(range(len(trainset)))
v_ind, t_ind = index[:5000], index[5000:]

trainloder = DataLoader(Subset(trainset, t_ind), batch_size=200, num_workers=2)
validloader = DataLoader(Subset(trainset, v_ind), batch_size=5000, num_workers=2)

Build your baseline CNN classifier.

In [ ]:
class BaseCNN(nn.Module):
    def __init__(self):
        super().__init__()

        # Convolution Layers:
        self.conv1 = nn.Sequential(
                nn.Conv2d(3, 10, kernel_size=5, stride=1),
                nn.ReLU(),       
                nn.MaxPool2d(kernel_size=2, stride=2))

        self.conv2 = nn.Sequential(
                nn.Conv2d(10,10, kernel_size=5, stride=1),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2, stride=2))
        
        # Linear Layer:
        self.fc1 = nn.Linear(10*5*5, 20)
        self.fc2 = nn.Linear(20, 10)

        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, a=0, mode='fan_in', nonlinearity='relu')


    def forward(self, x):
        # print(x.shape)
        # Layer1
        x = self.conv1(x)

        # Layer2
        x = self.conv2(x)

        # Layer 3
        x = x.reshape(x.shape[0], -1)
        x = self.fc1(x)
        x = nn.functional.relu(x)

        # Final Layer
        x = self.fc2(x)

        return x 
In [ ]:
model = BaseCNN().to(device)
In [ ]:
def get_accuracy(output, target, batch_size):
    ''' Obtain accuracy for training round '''
    corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).sum()
    accuracy = 100.0 * corrects/batch_size
    return accuracy.item()

def train_model(epochs=200, model=None, trainloder=None, validloader=None):
    # Loss Function
    criterion = nn.CrossEntropyLoss()

    # Loss Optimizer
    optimizer = optim.Adam(model.parameters(), lr = 0.001)

    # No:of times to train data
    train_acc = []
    valid_acc = []
    for e in tqdm(range(epochs)):
        train_acc_temp = 0.0
        for i, (input, output) in enumerate(trainloder):
            input, output = input.to(device), output.to(device)

            optimizer.zero_grad()

            predictions = model(input) # Output predictions
            loss = criterion(predictions, output) # Loss  Caluclation
            loss.backward() # Pass loss function gradients to pervious layers:
            optimizer.step() # Update Weights
            train_acc_temp += get_accuracy(predictions, output, 200)
        train_acc.append(train_acc_temp/i)

        # Validation Step.
        with torch.no_grad():
            for images, labels in validloader:
                images, labels = images.to(device), labels.to(device)
                output = model(images)
                valid_acc.append(get_accuracy(output, labels, 5000))
    return epochs, valid_acc, train_acc
In [ ]:
epoch, valid_acc, train_acc = train_model(model=model, epochs=100, trainloder=trainloder, validloader=validloader)

In [ ]:
sns.set_theme()
epoch = list(range(100))
plt.figure(figsize=(15, 10))
va = plt.plot(epoch, valid_acc, label='valid_acc')
ta = plt.plot(epoch, train_acc, label='train_acc')
plt.legend(['base_valid_acc', 'base_train_acc'])
plt.title("Accuracy of Non Augmented Data")
plt.xlabel('Number of Epochs')
plt.ylabel("Accuracy (%)")
plt.show()
In [ ]:
epoch = list(range(100))
# Create traces
fig = go.Figure()
fig.add_trace(go.Scatter(x=epoch, y=valid_acc,
                    mode='lines',
                    name='valid_acc'))
fig.add_trace(go.Scatter(x=epoch, y=train_acc,
                    mode='lines',
                    name='train_acc'))
# fig.title
fig.update_layout(
    title="Accuracy of Non Augmented Data",
    xaxis_title="Number of Epochs",
    yaxis_title="Accuracy (%)",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    ),
    width=900,
    height=600
)

fig.show()
In [ ]:
# Accuracy on Test Data
with torch.no_grad():
    for images, labels in validloader:
        images, labels = images.to(device), labels.to(device)
        output = model(images)
        test_acc = get_accuracy(output, labels, 5000)
print(f'Accuracy of the network on the 5000 validation images: {test_acc}%')
Accuracy of the network on the 5000 validation images: 64.41999816894531%

Build another classifier using augmented dataset.

In [ ]:
# Brightness Transformation
brighter = transforms.Compose(
    [transforms.ToTensor(),
     transforms.ColorJitter(brightness=(1.1,1.1)),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# Darker Transformation
darker = transforms.Compose(
    [transforms.ToTensor(),
     transforms.ColorJitter(brightness=(0.9,0.9)),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# Horizontal Flip
flip = transforms.Compose(
    [transforms.ToTensor(),
    transforms.RandomHorizontalFlip(p=1),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
In [ ]:
data_aug1 = CIFAR10(root='./aug1', train=True, download=True, transform=brighter)
data_aug2 = CIFAR10(root='./aug2', train=True, download=True, transform=darker)
data_aug3 = CIFAR10(root='./aug3', train=True, download=True, transform=flip)
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./aug1/cifar-10-python.tar.gz
Extracting ./aug1/cifar-10-python.tar.gz to ./aug1
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./aug2/cifar-10-python.tar.gz
Extracting ./aug2/cifar-10-python.tar.gz to ./aug2
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./aug3/cifar-10-python.tar.gz
Extracting ./aug3/cifar-10-python.tar.gz to ./aug3
In [ ]:
def imshow(img):
    img = img / 2 + 0.5     # unnormalize
    npimg = img.numpy()
    plt.figure(figsize=(15,150))
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()

# get some random training images
n = 11
images = [data_aug1[n][0], data_aug2[n][0], data_aug3[n][0], trainset[n][0]]

# show images
imshow(make_grid(images))
In [ ]:
data = ConcatDataset([Subset(data_aug1, t_ind), Subset(data_aug2, t_ind), Subset(data_aug3, t_ind), Subset(trainset, t_ind)])
In [ ]:
#input_batch shape: (64, in_channels, 224, 224)
images = []
labels = []
for ch in data:
    images.append(ch[0])
    labels.append(ch[1])
# result = torch.cat(outputs, dim=1)  #shape (64, 32*in_channels, 224, 224)
In [ ]:
images = torch.cat(images, dim=0)
In [ ]:
images = images.reshape(180000, 3, 32, 32)
In [ ]:
labels = torch.LongTensor(labels)
In [ ]:
class AugmentedCIFAR10(Dataset):
    """Augmented CIFAR10 dataset."""

    def __init__(self, images, labels):
        # Load Data
        self.images =  images 
        self.labels = labels

        self.images, self.labels = self.images.to(device), self.labels.to(device)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        img, label = self.images[idx], self.labels[idx]
        return img, label
In [ ]:
cifar_data = AugmentedCIFAR10(images=images, labels=labels)
In [ ]:
aug_trainloader = DataLoader(cifar_data, batch_size=200, num_workers=0)
aug_validloader = DataLoader(Subset(trainset, v_ind), batch_size=5000, num_workers=0)
In [ ]:
aug_model = BaseCNN().to(device) # Augmented Model
In [ ]:
def get_accuracy(output, target, batch_size):
    ''' Obtain accuracy for training round '''
    corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).sum()
    accuracy = 100.0 * corrects/batch_size
    return accuracy.item()

def train_model(epochs=200, model=None, trainloder=None, validloader=None):
    # Loss Function
    criterion = nn.CrossEntropyLoss()

    # Loss Optimizer
    optimizer = optim.Adam(model.parameters(), lr = 0.001)

    # No:of times to train data
    train_acc = []
    valid_acc = []
    for e in tqdm(range(epochs)):
        train_acc_temp = 0.0
        for i, (input, output) in enumerate(trainloder):
            # input, output = input.to(device), output.to(device)

            optimizer.zero_grad()

            predictions = model(input) # Output predictions
            loss = criterion(predictions, output) # Loss  Caluclation
            loss.backward() # Pass loss function gradients to pervious layers:
            optimizer.step() # Update Weights
            train_acc_temp += get_accuracy(predictions, output, 200)
        train_acc.append(train_acc_temp/i)

        # Validation Step.
        with torch.no_grad():
            for images, labels in validloader:
                images, labels = images.to(device), labels.to(device)
                output = model(images)
                valid_acc.append(get_accuracy(output, labels, 5000))
    return epochs, valid_acc, train_acc
In [ ]:
epochs, aug_valid_acc, aug_train_acc = train_model(model=aug_model, epochs=100, trainloder=aug_trainloader, validloader=aug_validloader)

In [ ]:
sns.set_theme()
epoch = list(range(100))
plt.figure(figsize=(15, 10))
va = plt.plot(epoch, valid_acc, label='valid_acc')
ta = plt.plot(epoch, train_acc, label='train_acc')
ava = plt.plot(epoch, aug_valid_acc, label='aug_valid_acc')
ata = plt.plot(epoch, aug_train_acc, label='aug_train_acc')
plt.legend(['base_valid_acc', 'base_train_acc', 'aug_valid_acc', 'aug_train_acc'])
plt.title("Accuracy of Augmented Vs Non Augmented Data")
plt.xlabel('Number of Epochs')
plt.ylabel("Accuracy (%)")
plt.show()
In [ ]:
epoch = list(range(100))
# Create traces
fig = go.Figure()
fig.add_trace(go.Scatter(x=epoch, y=valid_acc,
                    mode='lines',
                    name='valid_acc'))
fig.add_trace(go.Scatter(x=epoch, y=train_acc,
                    mode='lines',
                    name='train_acc'))
fig.add_trace(go.Scatter(x=epoch, y=aug_valid_acc,
                    mode='lines', name='aug_valid_acc'))
fig.add_trace(go.Scatter(x=epoch, y=aug_train_acc,
                    mode='lines', name='aug_train_acc'))
# fig.title
fig.update_layout(
    title="Accuracy of Augmented Vs Non Augmented Data",
    xaxis_title="Number of Epochs",
    yaxis_title="Accuracy (%)",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    ),
    width=900,
    height=600
)

fig.show()

Save Variables

In [ ]:
variables = {'valid_acc':valid_acc,
             'train_acc': train_acc,
             'aug_train_acc': aug_train_acc,
             'aug_valid_acc': aug_valid_acc}
In [ ]:
import pickle

# with open('problem3.pickle', 'wb') as f:
#     # Pickle the 'data' dictionary using the highest protocol available.
#     pickle.dump(variables, f, pickle.HIGHEST_PROTOCOL)

with open('problem3.pickle', 'rb') as f:
    # The protocol version used is detected automatically, so we do not
    # have to specify it.
    variables = pickle.load(f)

valid_acc = variables['valid_acc']
train_acc = variables['train_acc']
aug_train_acc = variables['aug_train_acc']
aug_valid_acc = variables['aug_valid_acc']
In [ ]:
 

Self-Supervised Learning via Pretext Tasks

In [ ]:
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

trainset = CIFAR10(root='./data', train=True, download=True, transform=transform)
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz
Extracting ./data/cifar-10-python.tar.gz to ./data
In [ ]:
index = list(range(50000))
unlab_ind, lab_ind = index[:49500], index[49500:]
In [ ]:
labeled_data = Subset(trainset, lab_ind)
unlabeled_data = Subset(trainset, unlab_ind)
In [ ]:
ind = [[] for _ in range(10)]
for lab in labeled_data:
    ind[lab[1]].append(lab[1])
for i in ind:
    print(f'{i[0]} count is {len(i)}')

The pretext task:

In [ ]:
# Vertically Flipping
vertical_flip = transforms.Compose(
    [transforms.ToTensor(),
     transforms.RandomVerticalFlip(p=1),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# Rotate 90 degree counter clock wise
rotate90 = transforms.Compose([transforms.RandomRotation([90,90]),
                               transforms.ToTensor(),
                               transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
In [ ]:
data_aug1 = CIFAR10(root='./aug41', train=True, download=True, transform=vertical_flip)
data_aug2 = CIFAR10(root='./aug42', train=True, download=True, transform=rotate90)
Files already downloaded and verified
Files already downloaded and verified
In [ ]:
len(unlab_ind)*3
Out[ ]:
148500
In [ ]:
for data in unlabeled_data:
    print(data[0].shape)
    break
torch.Size([3, 32, 32])
In [ ]:
def imshow(img):
    img = img / 2 + 0.5     # unnormalize
    npimg = img.numpy()
    plt.figure(figsize=(15,150))
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()

# get some random training images
n = 49499-100
images = [data_aug1[n][0], data_aug2[n][0], unlabeled_data[n][0]]

# show images
imshow(make_grid(images))
In [ ]:
# data = ConcatDataset([Subset(trainset, unlab_ind), Subset(data_aug1, unlab_ind), Subset(data_aug2, unlab_ind)])
data_aug1 = Subset(data_aug1, unlab_ind)
data_aug2 = Subset(data_aug2, unlab_ind)
In [ ]:
images = []
labels = []

# Unlabeled Data
for data in unlabeled_data:
    images.append(data[0])
    labels.append(0)
print("Unlabelled Data Done")

# Data Aug 1
for data in data_aug1:
    images.append(data[0])
    labels.append(1)
print("Vertical Flipping Done")

# Data Aug 2
for data in data_aug2:
    images.append(data[0])
    labels.append(2)
print("90 degree rotation Done")
Unlabelled Data Done
Vertical Flipping Done
90 degree rotation Done
In [ ]:
for i in range(len(images)):
    images[i] = images[i].reshape(1,3,32,32)
In [ ]:
images = torch.cat(images, dim=0)
In [ ]:
images = images.reshape(148500, 3, 32, 32)
In [ ]:
labels = torch.LongTensor(labels)
In [ ]:
class UnlabeledCIFAR(Dataset):
    """Augmented CIFAR10 dataset."""

    def __init__(self, images, labels):
        # Load Data
        self.images =  images 
        self.labels = labels

        self.images, self.labels = self.images.to(device), self.labels.to(device)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        img, label = self.images[idx], self.labels[idx]
        return img, label
In [ ]:
unlabeled_data = UnlabeledCIFAR(images=images, labels=labels)
In [ ]:
unlab_trainloader = DataLoader(unlabeled_data, batch_size=512, num_workers=0, shuffle=True)
In [ ]:
class UnlabledCNN(nn.Module):
    def __init__(self):
        super().__init__()

        # Convolution Layers:
        self.conv1 = nn.Sequential(
                nn.Conv2d(3, 10, kernel_size=5, stride=1),
                nn.ReLU(),       
                nn.MaxPool2d(kernel_size=2, stride=2))

        self.conv2 = nn.Sequential(
                nn.Conv2d(10,10, kernel_size=5, stride=1),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2, stride=2))
        
        # Linear Layer:
        self.fc1 = nn.Linear(10*5*5, 20)
        self.fc2 = nn.Linear(20, 3)

        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, a=0, mode='fan_in', nonlinearity='relu')


    def forward(self, x):
        # Layer1
        x = self.conv1(x)

        # Layer2
        x = self.conv2(x)

        # Layer 3
        x = x.reshape(x.shape[0], -1)
        x = self.fc1(x)
        x = nn.functional.relu(x)

        # Final Layer
        x = self.fc2(x)

        return x 
In [ ]:
unlabel_model = UnlabledCNN().to(device)
In [ ]:
def get_accuracy(output, target, batch_size):
    ''' Obtain accuracy for training round '''
    corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).sum()
    accuracy = 100.0 * corrects/batch_size
    return accuracy.item()

    
# Loss Function
criterion = nn.CrossEntropyLoss()

# Loss Optimizer
optimizer = optim.Adam(unlabel_model.parameters(), lr = 0.001)

train_acc = []
# No:of times to train data
for e in tqdm(range(100)):
    train_acc_temp = 0
    for i, (input, output) in enumerate(unlab_trainloader):
        # input, output = input.to(device), output.to(device)

        optimizer.zero_grad()

        predictions = unlabel_model(input) # Output predictions
        loss = criterion(predictions, output) # Loss  Caluclation
        loss.backward() # Pass loss function gradients to pervious layers:
        optimizer.step() # Update Weights
        train_acc_temp += get_accuracy(predictions, output, 256)
    train_acc.append(train_acc_temp/i)
print(loss.item())
0.4139147400856018
In [ ]:
train_acc = np.array(train_acc)
train_acc = train_acc/2
In [ ]:
PATH = './Pretext.pth'
torch.save(unlabel_model.state_dict(), PATH) 

# Load Model
# unlabel_model = UnlabledCNN().to(device)
# unlabel_model.load_state_dict(torch.load(PATH))
In [ ]:
test_acc = 0
with torch.no_grad():
    for i, (images, labels) in enumerate(unlab_trainloader):
        # images, labels = images.to(device), labels.to(device)
        output = unlabel_model(images)
        test_acc += get_accuracy(output, labels, 512)
print(f'Accuracy of the network on the 5000 validation images: {test_acc/i}%')
Accuracy of the network on the 5000 validation images: 77.24811422413794%
Observations:
  • The accuracy of the pretext model is 77.248

The baseline:

In [ ]:
class BaseCNN(nn.Module):
    def __init__(self):
        super().__init__()

        # Convolution Layers:
        self.conv1 = nn.Sequential(
                nn.Conv2d(3, 10, kernel_size=5, stride=1),
                nn.ReLU(),       
                nn.MaxPool2d(kernel_size=2, stride=2))

        self.conv2 = nn.Sequential(
                nn.Conv2d(10,10, kernel_size=5, stride=1),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2, stride=2))
        
        # Linear Layer:
        self.fc1 = nn.Linear(10*5*5, 20)
        self.fc2 = nn.Linear(20, 10)

        for m in self.modules():
            if isinstance(m, nn.Linear):
                print()
                nn.init.kaiming_normal_(m.weight, a=0, mode='fan_in', nonlinearity='relu')


    def forward(self, x):
        # Layer1
        x = self.conv1(x)

        # Layer2
        x = self.conv2(x)

        # Layer 3
        x = x.reshape(x.shape[0], -1)
        x = self.fc1(x)
        x = nn.functional.relu(x)

        # Final Layer
        x = self.fc2(x)

        return x 
In [ ]:
model = BaseCNN().to(device)
In [ ]:
trainloader = DataLoader(labeled_data, batch_size=500)
testset = CIFAR10(root='./data', train=False, download=True, transform=transform)
testloader = DataLoader(testset, batch_size=10000)
Files already downloaded and verified
In [ ]:
def get_accuracy(output, target, batch_size):
    ''' Obtain accuracy for training round '''
    corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).sum()
    accuracy = 100.0 * corrects/batch_size
    return accuracy.item()

def train_model(epochs=200, model=None, trainloder=None, validloader=None):
    # Loss Function
    criterion = nn.CrossEntropyLoss()

    # Loss Optimizer
    optimizer = optim.Adam(model.parameters(), lr = 0.001)

    # No:of times to train data
    # train_acc = []
    valid_acc = []
    for e in tqdm(range(epochs)):
        # train_acc_temp = 0.0
        for i, (input, output) in enumerate(trainloder):
            # input, output = input.to(device), output.to(device)

            optimizer.zero_grad()

            predictions = model(input) # Output predictions
            loss = criterion(predictions, output) # Loss  Caluclation
            loss.backward() # Pass loss function gradients to pervious layers:
            optimizer.step() # Update Weights
            # train_acc_temp += get_accuracy(predictions, output, 200)
        # train_acc.append(train_acc_temp/i)

        # Validation Step.
        if e%100 == 0:
            with torch.no_grad():
                for images, labels in validloader:
                    images, labels = images.to(device), labels.to(device)
                    output = model(images)
                    valid_acc.append(get_accuracy(output, labels, 10000))
    return valid_acc
In [ ]:
base_valid_acc = train_model(model=model, epochs=10000, trainloder=trainloader, validloader=testloader)

In [ ]:
sns.set_theme()
epoch = list(range(0, 10000, 100))
plt.figure(figsize=(15, 10))
plt.plot(epoch, base_valid_acc, label='base_valid_acc')
plt.legend(['base_valid_acc'])
plt.title("Test Accuracy of Baseline Model")
plt.xlabel('Number of Epochs')
plt.ylabel("Accuracy (%)")
plt.show()
In [ ]:
epoch = list(range(0, 10000, 100))
# Create traces
fig = go.Figure()
fig.add_trace(go.Scatter(x=epoch, y=base_valid_acc,
                    mode='lines',
                    name='base_valid_acc'))

# fig.title
fig.update_layout(
    title="Test Accuracy of BaseLine model",
    xaxis_title="Number of Epochs",
    yaxis_title="Accuracy (%)",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    ),
    width=1000,
    height=700
)

fig.show()
In [ ]:
import pickle
# variables = {'base_valid_acc': base_valid_acc}
# with open('problem4base_acc.pickle', 'wb') as f:
#     # Pickle the 'data' dictionary using the highest protocol available.
#     pickle.dump(variables, f, pickle.HIGHEST_PROTOCOL)

with open('problem4base_acc.pickle', 'rb') as f:
    # The protocol version used is detected automatically, so we do not
    # have to specify it.
    variables = pickle.load(f)
base_valid_acc = variables['base_valid_acc']

The transfer learning task:

In [545]:
PATH = './Pretext.pth'
pretrain_model = UnlabledCNN().to(device)
pretrain_model.load_state_dict(torch.load(PATH))
Out[545]:
<All keys matched successfully>
In [546]:
pretrain_model.fc2 = nn.Linear(in_features=20, out_features=10, bias=True).to(device)
In [547]:
nn.init.kaiming_uniform_(pretrain_model.fc2.weight, mode='fan_in', nonlinearity='relu')
Out[547]:
Parameter containing:
tensor([[ 0.2020,  0.3483, -0.3317,  0.4937, -0.2322, -0.3488, -0.2582,  0.0510,
         -0.2508, -0.4528,  0.1640, -0.2180,  0.4468,  0.2931,  0.4508,  0.3176,
         -0.2941, -0.4537, -0.0221, -0.0540],
        [ 0.2477, -0.2812,  0.1500,  0.4603, -0.4513,  0.0605,  0.0887,  0.4836,
         -0.0932,  0.2507,  0.0817, -0.3898, -0.3603, -0.4427,  0.4257,  0.4946,
         -0.3040, -0.1103,  0.0669, -0.1089],
        [ 0.1845,  0.2236,  0.2095, -0.5348, -0.3494,  0.4528, -0.3673,  0.0179,
          0.2210, -0.1633, -0.4462, -0.4715, -0.1478, -0.2172, -0.4125, -0.3245,
         -0.4368,  0.1716, -0.3481, -0.2318],
        [-0.2464,  0.4524,  0.0119, -0.3988, -0.3822, -0.3497,  0.5148,  0.1258,
          0.0035, -0.2576,  0.1076,  0.0256, -0.3562,  0.0499,  0.0548, -0.3078,
          0.1551,  0.2888,  0.1595,  0.2119],
        [-0.4198, -0.5466,  0.0666, -0.3408,  0.5213, -0.4301, -0.3134, -0.2447,
          0.1788, -0.2959, -0.1145,  0.2417,  0.1545, -0.4066,  0.4623, -0.5390,
         -0.2802,  0.3612,  0.5079,  0.0835],
        [ 0.0865, -0.2890, -0.2021, -0.3338,  0.1455, -0.2988, -0.0408, -0.3835,
         -0.4397,  0.5097, -0.4784,  0.2926, -0.4414,  0.0851,  0.0961,  0.0177,
         -0.1532,  0.3708, -0.3209,  0.0173],
        [ 0.1430,  0.0585, -0.2550,  0.2544, -0.3219, -0.5014,  0.1784, -0.1755,
          0.2808,  0.1366, -0.1131,  0.3237, -0.3387, -0.4647,  0.1449,  0.0038,
          0.1900,  0.3175, -0.4824, -0.2323],
        [ 0.2177, -0.0978,  0.1728,  0.5003, -0.1123, -0.2695,  0.4593, -0.1522,
         -0.2398, -0.5025, -0.3977,  0.2134,  0.5260,  0.3351,  0.2053, -0.5275,
          0.0429, -0.4100, -0.3786, -0.1377],
        [-0.5011,  0.3716, -0.0774,  0.4207,  0.0511,  0.1881, -0.1623,  0.1009,
          0.1480, -0.2925,  0.3462, -0.4335, -0.4457,  0.1845,  0.1516,  0.5041,
         -0.3153,  0.4704, -0.2953, -0.4202],
        [ 0.5440, -0.0098,  0.0696,  0.3900,  0.3748,  0.1900, -0.0308,  0.1360,
         -0.4096,  0.4917, -0.5064, -0.5162, -0.4180, -0.2261,  0.0753, -0.1759,
          0.1087, -0.2585, -0.2683,  0.2713]], device='cuda:0',
       requires_grad=True)
In [548]:
def get_accuracy(output, target, batch_size):
    ''' Obtain accuracy for training round '''
    corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).sum()
    accuracy = 100.0 * corrects/batch_size
    return accuracy.item()

def train_model(epochs=200, model=None, trainloder=None, validloader=None):
    # Loss Function
    criterion = nn.CrossEntropyLoss()

    # Loss Optimizer
    optimizer = optim.Adam([
                            {"params": model.conv1.parameters(), "lr": 1e-5},
                            {"params": model.conv2.parameters(), "lr": 1e-5},
                            {"params": model.fc1.parameters(), "lr": 1e-6},
                            {"params": model.fc2.parameters(), "lr": 1e-3},
                            ])

    # No:of times to train data
    # train_acc = []
    valid_acc = []
    for e in tqdm(range(epochs)):
        # train_acc_temp = 0.0
        for i, (input, output) in enumerate(trainloder):
            input, output = input.to(device), output.to(device)

            optimizer.zero_grad()

            predictions = model(input) # Output predictions
            loss = criterion(predictions, output) # Loss  Caluclation
            loss.backward() # Pass loss function gradients to pervious layers:
            optimizer.step() # Update Weights
            # train_acc_temp += get_accuracy(predictions, output, 200)
        # train_acc.append(train_acc_temp/i)

        # Validation Step.
        if e%100 == 0:
            with torch.no_grad():
                for images, labels in validloader:
                    images, labels = images.to(device), labels.to(device)
                    output = model(images)
                    valid_acc.append(get_accuracy(output, labels, 10000))
    return valid_acc
In [549]:
pretrain_valid_acc = train_model(model=pretrain_model, epochs=10000, trainloder=trainloader, validloader=testloader)

In [550]:
sns.set_theme()
epoch = list(range(0, 10000, 100))
plt.figure(figsize=(15, 10))
plt.plot(epoch, base_valid_acc, label='base_valid_acc')
plt.plot(epoch, pretrain_valid_acc, label='pretrain_valid_acc')
plt.legend(['base_valid_acc', 'pretrain_valid_acc'])
plt.title("Test Accuracy of BaseLine Vs Pretrained Model")
plt.xlabel('Number of Epochs')
plt.ylabel("Accuracy (%)")
plt.show()
In [ ]:
# epoch = list(range(0, 10000, 100))
epoch = list(range(0, 10000, 100))
# Create traces
fig = go.Figure()
fig.add_trace(go.Scatter(x=epoch, y=pretrain_valid_acc,
                    mode='lines',
                    name='pretrain_valid_acc'))

fig.add_trace(go.Scatter(x=epoch, y=base_valid_acc,
                    mode='lines',
                    name='base_valid_acc'))

# fig.title
fig.update_layout(
    title="Test Accuracy of BaseLine Vs Pretrained Model",
    xaxis_title="Number of Epochs",
    yaxis_title="Accuracy (%)",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    ),
    width=1500,
    height=800
)

fig.show()

Observations:

  • Accuracy of the model starts increasing when trained with pretext model.

Save Variables

In [553]:
import pickle

variables = {'base_valid_acc': base_valid_acc,
             'pretrain_valid_acc': pretrain_valid_acc}
with open('problem4.pickle', 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(variables, f, pickle.HIGHEST_PROTOCOL)

# with open('problem4.pickle', 'rb') as f:
#     # The protocol version used is detected automatically, so we do not
#     # have to specify it.
#     variables = pickle.load(f)
# print(variables)
# # base_valid_acc = variables['base_valid_acc']
# # pretrained_valid_acc = variables['pretrained_valid_acc']
In [ ]: